import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the tab-separated Open Food Facts export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
data = pd.read_csv(
    "/content/drive/MyDrive/Datasets/PublicHealthAppData/fr.openfoodfacts.org.products.csv",
    sep="\t",
    low_memory=False,
)
/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (0,3,5,19,20,24,25,26,27,28,35,36,37,38,39,48) have mixed types.Specify dtype option on import or set low_memory=False. exec(code_obj, self.user_global_ns, self.user_ns)
# Size of the raw table as (rows, columns).
data.shape
(320772, 162)
# Summary statistics for every column, numeric and non-numeric alike.
data.describe(include="all")
code | url | creator | created_t | created_datetime | last_modified_t | last_modified_datetime | product_name | generic_name | quantity | ... | ph_100g | fruits-vegetables-nuts_100g | collagen-meat-protein-ratio_100g | cocoa_100g | chlorophyl_100g | carbon-footprint_100g | nutrition-score-fr_100g | nutrition-score-uk_100g | glycemic-index_100g | water-hardness_100g | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3.207490e+05 | 320749 | 320770 | 3.207690e+05 | 320763 | 3.207720e+05 | 320772 | 303010 | 52795 | 104819 | ... | 49.000000 | 3036.000000 | 165.000000 | 948.000000 | 0.0 | 268.000000 | 221210.000000 | 221210.000000 | 0.0 | 0.0 |
unique | 3.206380e+05 | 320749 | 3535 | 1.896360e+05 | 189568 | 1.806390e+05 | 180495 | 221347 | 38584 | 13826 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
top | 2.446306e+10 | http://world-fr.openfoodfacts.org/produit/0000... | usda-ndb-import | 1.489077e+09 | 2017-03-09T10:37:09Z | 1.439142e+09 | 2015-08-09T17:35:42Z | Ice Cream | Pâtes alimentaires au blé dur de qualité supér... | 500 g | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
freq | 2.000000e+00 | 1 | 169868 | 2.000000e+01 | 20 | 3.300000e+01 | 33 | 410 | 201 | 4669 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
mean | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 6.425698 | 31.458587 | 15.412121 | 49.547785 | NaN | 341.700764 | 9.165535 | 9.058049 | NaN | NaN |
std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 2.047841 | 31.967918 | 3.753028 | 18.757932 | NaN | 425.211439 | 9.055903 | 9.183589 | NaN | NaN |
min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.000000 | 0.000000 | 8.000000 | 6.000000 | NaN | 0.000000 | -15.000000 | -15.000000 | NaN | NaN |
25% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 6.300000 | 0.000000 | 12.000000 | 32.000000 | NaN | 98.750000 | 1.000000 | 1.000000 | NaN | NaN |
50% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 7.200000 | 23.000000 | 15.000000 | 50.000000 | NaN | 195.750000 | 10.000000 | 9.000000 | NaN | NaN |
75% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 7.400000 | 51.000000 | 15.000000 | 64.250000 | NaN | 383.200000 | 16.000000 | 16.000000 | NaN | NaN |
max | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 8.400000 | 100.000000 | 25.000000 | 100.000000 | NaN | 2842.000000 | 40.000000 | 40.000000 | NaN | NaN |
11 rows × 162 columns
# Share of empty cells in the raw table: missing cells over the total cell count
# (data.size equals the number of missing plus non-missing cells).
print(np.round(100 * data.isna().sum().sum() / data.size), "% of missing data")
76.0 % of missing data
# List the object-typed (categorical/text) columns.
# select_dtypes yields the same column labels as describe(include="object")
# without computing summary statistics over the whole table twice.
categorical_columns = data.select_dtypes(include="object").columns.tolist()
print("categorical features:", len(categorical_columns))
print(categorical_columns)
categorical features: 56 ['code', 'url', 'creator', 'created_t', 'created_datetime', 'last_modified_t', 'last_modified_datetime', 'product_name', 'generic_name', 'quantity', 'packaging', 'packaging_tags', 'brands', 'brands_tags', 'categories', 'categories_tags', 'categories_fr', 'origins', 'origins_tags', 'manufacturing_places', 'manufacturing_places_tags', 'labels', 'labels_tags', 'labels_fr', 'emb_codes', 'emb_codes_tags', 'first_packaging_code_geo', 'cities', 'cities_tags', 'purchase_places', 'stores', 'countries', 'countries_tags', 'countries_fr', 'ingredients_text', 'allergens', 'allergens_fr', 'traces', 'traces_tags', 'traces_fr', 'serving_size', 'additives', 'additives_tags', 'additives_fr', 'ingredients_from_palm_oil_tags', 'ingredients_that_may_be_from_palm_oil_tags', 'nutrition_grade_fr', 'pnns_groups_1', 'pnns_groups_2', 'states', 'states_tags', 'states_fr', 'main_category', 'main_category_fr', 'image_url', 'image_small_url']
# List the columns that are not object-typed.
# select_dtypes yields the same column labels as describe(exclude="object")
# without computing summary statistics over the whole table twice.
non_categorical_columns = data.select_dtypes(exclude="object").columns.tolist()
print("non-categorical features:", len(non_categorical_columns))
print(non_categorical_columns)
non(categorical features: 106 ['no_nutriments', 'additives_n', 'ingredients_from_palm_oil_n', 'ingredients_from_palm_oil', 'ingredients_that_may_be_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil', 'nutrition_grade_uk', 'energy_100g', 'energy-from-fat_100g', 'fat_100g', 'saturated-fat_100g', 'butyric-acid_100g', 'caproic-acid_100g', 'caprylic-acid_100g', 'capric-acid_100g', 'lauric-acid_100g', 'myristic-acid_100g', 'palmitic-acid_100g', 'stearic-acid_100g', 'arachidic-acid_100g', 'behenic-acid_100g', 'lignoceric-acid_100g', 'cerotic-acid_100g', 'montanic-acid_100g', 'melissic-acid_100g', 'monounsaturated-fat_100g', 'polyunsaturated-fat_100g', 'omega-3-fat_100g', 'alpha-linolenic-acid_100g', 'eicosapentaenoic-acid_100g', 'docosahexaenoic-acid_100g', 'omega-6-fat_100g', 'linoleic-acid_100g', 'arachidonic-acid_100g', 'gamma-linolenic-acid_100g', 'dihomo-gamma-linolenic-acid_100g', 'omega-9-fat_100g', 'oleic-acid_100g', 'elaidic-acid_100g', 'gondoic-acid_100g', 'mead-acid_100g', 'erucic-acid_100g', 'nervonic-acid_100g', 'trans-fat_100g', 'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'sucrose_100g', 'glucose_100g', 'fructose_100g', 'lactose_100g', 'maltose_100g', 'maltodextrins_100g', 'starch_100g', 'polyols_100g', 'fiber_100g', 'proteins_100g', 'casein_100g', 'serum-proteins_100g', 'nucleotides_100g', 'salt_100g', 'sodium_100g', 'alcohol_100g', 'vitamin-a_100g', 'beta-carotene_100g', 'vitamin-d_100g', 'vitamin-e_100g', 'vitamin-k_100g', 'vitamin-c_100g', 'vitamin-b1_100g', 'vitamin-b2_100g', 'vitamin-pp_100g', 'vitamin-b6_100g', 'vitamin-b9_100g', 'folates_100g', 'vitamin-b12_100g', 'biotin_100g', 'pantothenic-acid_100g', 'silica_100g', 'bicarbonate_100g', 'potassium_100g', 'chloride_100g', 'calcium_100g', 'phosphorus_100g', 'iron_100g', 'magnesium_100g', 'zinc_100g', 'copper_100g', 'manganese_100g', 'fluoride_100g', 'selenium_100g', 'chromium_100g', 'molybdenum_100g', 'iodine_100g', 'caffeine_100g', 'taurine_100g', 'ph_100g', 
'fruits-vegetables-nuts_100g', 'collagen-meat-protein-ratio_100g', 'cocoa_100g', 'chlorophyl_100g', 'carbon-footprint_100g', 'nutrition-score-fr_100g', 'nutrition-score-uk_100g', 'glycemic-index_100g', 'water-hardness_100g']
The application will be the following:
To create this app, we need the following features:
In the end, we have to be able to say whether the application is feasible with this data.
# Work on an independent copy restricted to the columns the application needs.
DataApp = data.loc[
    :,
    [
        "product_name",
        "countries",
        "image_url",
        "proteins_100g",
        "fat_100g",
        "carbohydrates_100g",
        "energy_100g",
    ],
].copy()
# Share of empty cells in that subset (DataApp.size is the total cell count).
print(np.round(100 * DataApp.isna().sum().sum() / DataApp.size), "% of missing data")
24.0 % of missing data
# Preview the first two rows that have no missing value in any column.
DataApp.dropna().head(2)
product_name | countries | image_url | proteins_100g | fat_100g | carbohydrates_100g | energy_100g | |
---|---|---|---|---|---|---|---|
106 | Lion Peanut x2 | France, US | http://fr.openfoodfacts.org/images/products/00... | 2.50 | 20.00 | 70.00 | 1883.0 |
138 | Pack de 2 Twix | France, US | http://fr.openfoodfacts.org/images/products/00... | 6.25 | 4.17 | 77.08 | 1481.0 |
# Summary statistics of the numeric columns (describe's default selection).
DataApp.describe()
proteins_100g | fat_100g | carbohydrates_100g | energy_100g | |
---|---|---|---|---|
count | 259922.000000 | 243891.000000 | 243588.000000 | 2.611130e+05 |
mean | 7.075940 | 12.730379 | 32.073981 | 1.141915e+03 |
std | 8.409054 | 17.578747 | 29.731719 | 6.447154e+03 |
min | -800.000000 | 0.000000 | 0.000000 | 0.000000e+00 |
25% | 0.700000 | 0.000000 | 6.000000 | 3.770000e+02 |
50% | 4.760000 | 5.000000 | 20.600000 | 1.100000e+03 |
75% | 10.000000 | 20.000000 | 58.330000 | 1.674000e+03 |
max | 430.000000 | 714.290000 | 2916.670000 | 3.251373e+06 |
Some values are impossible (for example negative amounts, or more than 100 g of a nutrient per 100 g of product), so we have to filter them out.
# Keep only rows whose three macro-nutrient values lie strictly between 0 and 100
# and whose energy lies strictly between 0 and 900. Rows with a missing value in
# any of these columns fail the comparisons and are dropped as well.
# NOTE(review): the strict "> 0" bounds also discard exact zeros.
# NOTE(review): the 900 upper bound looks like a kcal limit while the values
# above it in the raw data suggest energy_100g is in kJ -- confirm the unit.
# .copy() detaches the result from the original frame so that later in-place
# operations do not raise SettingWithCopyWarning.
DataApp = DataApp.query(
    "proteins_100g <100 & proteins_100g >0 & fat_100g <100 & fat_100g >0 & carbohydrates_100g <100 & carbohydrates_100g >0 & energy_100g>0 & energy_100g<900"
).copy()
DataApp.describe()
proteins_100g | fat_100g | carbohydrates_100g | energy_100g | |
---|---|---|---|---|
count | 52919.000000 | 52919.000000 | 52919.000000 | 52919.000000 |
mean | 5.974644 | 4.508432 | 11.967541 | 462.036901 |
std | 6.068280 | 4.506851 | 9.943370 | 230.928718 |
min | 0.010000 | 0.000100 | 0.010000 | 0.420000 |
25% | 1.770000 | 1.050000 | 4.800000 | 268.000000 |
50% | 3.570000 | 3.010000 | 10.000000 | 444.000000 |
75% | 7.600000 | 6.900000 | 16.430000 | 649.000000 |
max | 85.700000 | 83.800000 | 98.000000 | 899.000000 |
# Display the current contents of DataApp.
DataApp
product_name | countries | image_url | proteins_100g | fat_100g | carbohydrates_100g | energy_100g | |
---|---|---|---|---|---|---|---|
124 | Organic Wheat Bran | US | NaN | 13.33 | 3.33 | 66.67 | 837.0 |
185 | Quiche Lorraine | Canada | NaN | 5.36 | 6.79 | 7.86 | 478.0 |
186 | Pâté au poulet | Canada | NaN | 33.90 | 10.20 | 16.60 | 751.0 |
236 | Cauliflower | United Kingdom | http://fr.openfoodfacts.org/images/products/00... | 3.60 | 0.90 | 2.90 | 144.0 |
239 | Salsa de mostaza | España | http://fr.openfoodfacts.org/images/products/00... | 3.60 | 3.50 | 7.50 | 320.0 |
... | ... | ... | ... | ... | ... | ... | ... |
320668 | 100% Pur Jus 4 agrumes | France | NaN | 0.60 | 0.50 | 10.20 | 192.0 |
320686 | Haywards, Baby Beetroot | US | NaN | 0.90 | 0.50 | 11.00 | 866.0 |
320693 | Santa Cruz Chilli & Lime Dressing | United Kingdom | http://fr.openfoodfacts.org/images/products/96... | 0.30 | 6.90 | 23.10 | 660.0 |
320756 | Test NF App | en:CH | NaN | 2.10 | 31.00 | 12.20 | 569.0 |
320763 | Thé vert Earl grey | France | http://fr.openfoodfacts.org/images/products/99... | 0.50 | 0.20 | 0.50 | 21.0 |
52919 rows × 7 columns
# Size of DataApp as (rows, columns).
DataApp.shape
(52919, 7)
# Total number of missing cells across all columns of DataApp.
DataApp.isnull().sum().sum()
36786
# Drop every row that still has a missing value in any column.
# Rebinding the name (instead of inplace=True) avoids mutating a frame that may
# be a slice of another one, which is what triggers SettingWithCopyWarning.
DataApp = DataApp.dropna()
DataApp.shape
/usr/local/lib/python3.8/dist-packages/pandas/util/_decorators.py:311: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return func(*args, **kwargs)
(17108, 7)
# Summary statistics restricted to the non-object columns.
DataApp.describe(exclude="object")
proteins_100g | fat_100g | carbohydrates_100g | energy_100g | |
---|---|---|---|---|
count | 17108.000000 | 17108.000000 | 17108.000000 | 17108.000000 |
mean | 5.676540 | 4.004240 | 10.504606 | 420.763909 |
std | 6.219364 | 4.457737 | 9.749204 | 227.108105 |
min | 0.010000 | 0.000100 | 0.030000 | 0.420000 |
25% | 1.300000 | 0.500000 | 3.800000 | 227.000000 |
50% | 3.400000 | 2.600000 | 8.500000 | 401.000000 |
75% | 7.200000 | 6.000000 | 14.300000 | 591.000000 |
max | 45.000000 | 73.000000 | 97.700000 | 899.000000 |
Taking care of outliers:
from scipy import stats
def find_outliers(data, var):
    """Return the rows of ``data`` whose ``var`` value is an IQR outlier.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column.

    Args:
        data: pandas DataFrame containing the column to inspect.
        var: label of the numeric column to test.

    Returns:
        The subset of ``data`` (original index labels kept) whose ``var``
        value falls outside the two fences. Rows where ``var`` is missing
        are never reported as outliers.
    """
    q1 = data[var].quantile(0.25)
    q3 = data[var].quantile(0.75)
    # Derive the IQR from the same NaN-skipping quantiles. scipy.stats.iqr
    # propagates NaN by default, which turned both fences into NaN (and the
    # result into an empty frame) as soon as one value was missing.
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    return data[(data[var] < lower) | (data[var] > upper)]
# Collect the IQR outlier rows of each nutritional column, in the same order
# as the names on the left-hand side.
dfOutliersProteins, dfOutliersCarbs, dfOutliersFat, dfOutliersEnergy = (
    find_outliers(DataApp, column)
    for column in ("proteins_100g", "carbohydrates_100g", "fat_100g", "energy_100g")
)
# Density histogram with KDE overlay of proteins_100g, IQR outliers excluded.
# histplot(stat="density", kde=True) replaces the deprecated distplot.
sns.histplot(
    x=DataApp["proteins_100g"].drop(index=dfOutliersProteins.index),
    kde=True,
    stat="density",
)
plt.gcf().set_size_inches(11.7, 8.27)
plt.title("histogramme et densité de probabilité de la variable proteins_100g")
plt.show()
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
# Box plot of proteins_100g with the fliers hidden.
# The series is passed as x= because positional data vectors are deprecated.
sns.boxplot(x=DataApp["proteins_100g"], showfliers=False).set_title('distribution');
plt.gcf().set_size_inches(11.7, 6.27)
/usr/local/lib/python3.8/dist-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# Density histogram with KDE overlay of carbohydrates_100g, IQR outliers excluded.
# histplot(stat="density", kde=True) replaces the deprecated distplot.
sns.histplot(
    x=DataApp["carbohydrates_100g"].drop(index=dfOutliersCarbs.index),
    kde=True,
    stat="density",
)
plt.gcf().set_size_inches(11.7, 8.27)
# The title previously named proteins_100g (copy-paste slip).
plt.title("histogramme et densité de probabilité de la variable carbohydrates_100g")
plt.show()
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
# Box plot of carbohydrates_100g with the fliers hidden.
# The series is passed as x= because positional data vectors are deprecated.
sns.boxplot(x=DataApp["carbohydrates_100g"], showfliers=False).set_title('distribution');
plt.gcf().set_size_inches(11.7, 6.27)
/usr/local/lib/python3.8/dist-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# Density histogram with KDE overlay of fat_100g, IQR outliers excluded.
# histplot(stat="density", kde=True) replaces the deprecated distplot.
sns.histplot(
    x=DataApp["fat_100g"].drop(index=dfOutliersFat.index),
    kde=True,
    stat="density",
)
plt.gcf().set_size_inches(11.7, 8.27)
# The title previously named proteins_100g (copy-paste slip).
plt.title("histogramme et densité de probabilité de la variable fat_100g")
plt.show()
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
# Box plot of fat_100g with the fliers hidden.
# The series is passed as x= because positional data vectors are deprecated.
sns.boxplot(x=DataApp["fat_100g"], showfliers=False).set_title('distribution');
plt.gcf().set_size_inches(11.7, 6.27)
/usr/local/lib/python3.8/dist-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# Density histogram with KDE overlay of energy_100g, IQR outliers excluded.
# histplot(stat="density", kde=True) replaces the deprecated distplot.
sns.histplot(
    x=DataApp["energy_100g"].drop(index=dfOutliersEnergy.index),
    kde=True,
    stat="density",
)
plt.gcf().set_size_inches(11.7, 8.27)
# The title previously named proteins_100g (copy-paste slip).
plt.title("histogramme et densité de probabilité de la variable energy_100g")
plt.show()
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
# Box plot of energy_100g with the fliers hidden.
# The series is passed as x= because positional data vectors are deprecated.
sns.boxplot(x=DataApp["energy_100g"], showfliers=False).set_title('distribution')
plt.gcf().set_size_inches(11.7, 8.27)
/usr/local/lib/python3.8/dist-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# With all the outliers included.
# Restrict to numeric columns explicitly: DataApp also holds text columns, and
# recent pandas versions raise when corr() meets non-numeric data.
# The matrix is computed once and reused for both the mask and the plot.
corr_full = DataApp.select_dtypes(include="number").corr()
# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr_full, dtype=bool))
plt.figure(figsize=(20,15))
plt.xticks(rotation=45)
sns.heatmap(corr_full, cmap="RdBu", mask=mask, annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f7eb942f940>
# With a sample of n=500 the results are slightly different but still interesting.
# The sample is drawn once (fixed random_state) and its correlation matrix is
# computed once, on numeric columns only, then reused for the mask and the plot.
corr_sample = (
    DataApp.sample(n=500, random_state=2).select_dtypes(include="number").corr()
)
# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr_sample, dtype=bool))
plt.figure(figsize=(20,15))
sns.heatmap(corr_sample, cmap="RdBu", mask=mask, annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f7eb92d18e0>
# Remove the energy outliers, then the protein outliers found in what remains.
dataCurrentNoEnergyOutliers = DataApp.drop(index=dfOutliersEnergy.index)
# Despite its name, this frame holds the protein OUTLIER rows: find_outliers
# returns the rows lying outside the IQR fences.
datacurrentNoProteinsOutliers = find_outliers(
    dataCurrentNoEnergyOutliers, "proteins_100g"
)
datacurrentNoEnergy_ProteinsOutliers = dataCurrentNoEnergyOutliers.drop(
    index=datacurrentNoProteinsOutliers.index
)
# Keep only rows whose countries value is exactly "France" or "United Kingdom".
dfFranceUKProteins = datacurrentNoEnergy_ProteinsOutliers[
    datacurrentNoEnergy_ProteinsOutliers["countries"].isin(["France", "United Kingdom"])
]
fig, ax = plt.subplots(figsize=(50, 25))
sns.scatterplot(
    data=dfFranceUKProteins,
    x="proteins_100g",
    y="energy_100g",
    hue="countries",
    ax=ax,
).set_title("Scatter Plot beetween France and United Kingdom")
Text(0.5, 1.0, 'Scatter Plot beetween France and United Kingdom')
# Pearson correlation between protein content and energy, computed on the
# frame from which energy and protein outliers were removed.
print("correlation entre proteins et energy:")
# pearsonr returns the correlation coefficient together with its p-value.
stats.pearsonr(datacurrentNoEnergy_ProteinsOutliers['proteins_100g'], datacurrentNoEnergy_ProteinsOutliers['energy_100g'])
correlation entre proteins et energy:
(0.5342409264440046, 0.0)
# Bivariate density of proteins vs. energy on a reproducible 5000-row sample.
# fill=True replaces the deprecated shade=True.
sns.kdeplot(
    x="proteins_100g",
    y="energy_100g",
    cmap="Reds",
    fill=True,
    data=datacurrentNoEnergy_ProteinsOutliers.sample(n=5000, random_state=2),
).set_title("Joint density of proteins and energy");
plt.xlim(0, 40)
plt.ylim(None, 1000)
(-164.40015761582123, 1000.0)
However, some factors must be taken into account, such as country differences, accessibility of products by country, photos, etc.